Loading the libraries for regression.


In [1]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline

# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab


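A note on the imports: in scikit-learn 0.18 the cross_validation module was merged into model_selection, and it was removed entirely in 0.20. On a newer version the equivalent import would be (a sketch, not run in this notebook):

# On scikit-learn >= 0.18, train_test_split lives in model_selection.
from sklearn import model_selection, linear_model, feature_selection, metrics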

Supervised Regression

Linear Regression


In [4]:
# Read in the data using pandas.
Xy = pa.read_csv('Advertising.csv')

In [5]:
# Take a look at the contents.
Xy


Out[5]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9
5 6 8.7 48.9 75.0 7.2
6 7 57.5 32.8 23.5 11.8
7 8 120.2 19.6 11.6 13.2
8 9 8.6 2.1 1.0 4.8
9 10 199.8 2.6 21.2 10.6
10 11 66.1 5.8 24.2 8.6
11 12 214.7 24.0 4.0 17.4
12 13 23.8 35.1 65.9 9.2
13 14 97.5 7.6 7.2 9.7
14 15 204.1 32.9 46.0 19.0
15 16 195.4 47.7 52.9 22.4
16 17 67.8 36.6 114.0 12.5
17 18 281.4 39.6 55.8 24.4
18 19 69.2 20.5 18.3 11.3
19 20 147.3 23.9 19.1 14.6
20 21 218.4 27.7 53.4 18.0
21 22 237.4 5.1 23.5 12.5
22 23 13.2 15.9 49.6 5.6
23 24 228.3 16.9 26.2 15.5
24 25 62.3 12.6 18.3 9.7
25 26 262.9 3.5 19.5 12.0
26 27 142.9 29.3 12.6 15.0
27 28 240.1 16.7 22.9 15.9
28 29 248.8 27.1 22.9 18.9
29 30 70.6 16.0 40.8 10.5
... ... ... ... ... ...
170 171 50.0 11.6 18.4 8.4
171 172 164.5 20.9 47.4 14.5
172 173 19.6 20.1 17.0 7.6
173 174 168.4 7.1 12.8 11.7
174 175 222.4 3.4 13.1 11.5
175 176 276.9 48.9 41.8 27.0
176 177 248.4 30.2 20.3 20.2
177 178 170.2 7.8 35.2 11.7
178 179 276.7 2.3 23.7 11.8
179 180 165.6 10.0 17.6 12.6
180 181 156.6 2.6 8.3 10.5
181 182 218.5 5.4 27.4 12.2
182 183 56.2 5.7 29.7 8.7
183 184 287.6 43.0 71.8 26.2
184 185 253.8 21.3 30.0 17.6
185 186 205.0 45.1 19.6 22.6
186 187 139.5 2.1 26.6 10.3
187 188 191.1 28.7 18.2 17.3
188 189 286.0 13.9 3.7 15.9
189 190 18.7 12.1 23.4 6.7
190 191 39.5 41.1 5.8 10.8
191 192 75.5 10.8 6.0 9.9
192 193 17.2 4.1 31.6 5.9
193 194 166.8 42.0 3.6 19.6
194 195 149.7 35.6 6.0 17.3
195 196 38.2 3.7 13.8 7.6
196 197 94.2 4.9 8.1 9.7
197 198 177.0 9.3 6.4 12.8
198 199 283.6 42.0 66.2 25.5
199 200 232.1 8.6 8.7 13.4

200 rows × 5 columns


In [6]:
# Normalize the data.
# We do this to make plotting and processing easier.  Many sklearn functions do this
# for you behind the scenes, but here we do it explicitly.
# Note that this is a cousin of the physics idea of nondimensionalization.  Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands.  One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
Xy


Out[6]:
Unnamed: 0 TV Radio Newspaper Sales
0 0.000000 0.775786 0.762097 0.605981 0.807087
1 0.005025 0.148123 0.792339 0.394019 0.346457
2 0.010050 0.055800 0.925403 0.606860 0.303150
3 0.015075 0.509976 0.832661 0.511873 0.665354
4 0.020101 0.609063 0.217742 0.510994 0.444882
5 0.025126 0.027054 0.985887 0.656992 0.220472
6 0.030151 0.192087 0.661290 0.204046 0.401575
7 0.035176 0.404126 0.395161 0.099384 0.456693
8 0.040201 0.026716 0.042339 0.006157 0.125984
9 0.045226 0.673318 0.052419 0.183817 0.354331
10 0.050251 0.221170 0.116935 0.210202 0.275591
11 0.055276 0.723706 0.483871 0.032542 0.622047
12 0.060302 0.078120 0.707661 0.576957 0.299213
13 0.065327 0.327359 0.153226 0.060686 0.318898
14 0.070352 0.687859 0.663306 0.401935 0.685039
15 0.075377 0.658438 0.961694 0.462621 0.818898
16 0.080402 0.226919 0.737903 1.000000 0.429134
17 0.085427 0.949273 0.798387 0.488127 0.897638
18 0.090452 0.231654 0.413306 0.158311 0.381890
19 0.095477 0.495773 0.481855 0.165347 0.511811
20 0.100503 0.736219 0.558468 0.467018 0.645669
21 0.105528 0.800473 0.102823 0.204046 0.429134
22 0.110553 0.042273 0.320565 0.433597 0.157480
23 0.115578 0.769699 0.340726 0.227792 0.547244
24 0.120603 0.208319 0.254032 0.158311 0.318898
25 0.125628 0.886710 0.070565 0.168865 0.409449
26 0.130653 0.480893 0.590726 0.108179 0.527559
27 0.135678 0.809604 0.336694 0.198769 0.562992
28 0.140704 0.839026 0.546371 0.198769 0.681102
29 0.145729 0.236388 0.322581 0.356201 0.350394
... ... ... ... ... ...
170 0.854271 0.166723 0.233871 0.159191 0.267717
171 0.859296 0.553940 0.421371 0.414248 0.507874
172 0.864322 0.063916 0.405242 0.146878 0.236220
173 0.869347 0.567129 0.143145 0.109938 0.397638
174 0.874372 0.749746 0.068548 0.112577 0.389764
175 0.879397 0.934055 0.985887 0.364996 1.000000
176 0.884422 0.837673 0.608871 0.175901 0.732283
177 0.889447 0.573216 0.157258 0.306948 0.397638
178 0.894472 0.933378 0.046371 0.205805 0.401575
179 0.899497 0.557660 0.201613 0.152155 0.433071
180 0.904523 0.527224 0.052419 0.070361 0.350394
181 0.909548 0.736557 0.108871 0.238347 0.417323
182 0.914573 0.187690 0.114919 0.258575 0.279528
183 0.919598 0.970240 0.866935 0.628848 0.968504
184 0.924623 0.855935 0.429435 0.261214 0.629921
185 0.929648 0.690903 0.909274 0.169745 0.826772
186 0.934673 0.469395 0.042339 0.231310 0.342520
187 0.939698 0.643896 0.578629 0.157432 0.618110
188 0.944724 0.964829 0.280242 0.029903 0.562992
189 0.949749 0.060873 0.243952 0.203166 0.200787
190 0.954774 0.131214 0.828629 0.048373 0.362205
191 0.959799 0.252959 0.217742 0.050132 0.326772
192 0.964824 0.055800 0.082661 0.275286 0.169291
193 0.969849 0.561718 0.846774 0.029024 0.708661
194 0.974874 0.503889 0.717742 0.050132 0.618110
195 0.979899 0.126818 0.074597 0.118734 0.236220
196 0.984925 0.316199 0.098790 0.068602 0.318898
197 0.989950 0.596212 0.187500 0.053650 0.440945
198 0.994975 0.956713 0.846774 0.579595 0.940945
199 1.000000 0.782550 0.173387 0.073879 0.464567

200 rows × 5 columns
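For reference, scikit-learn's preprocessing module can do the same min-max scaling; a minimal sketch (note it returns a NumPy array rather than a DataFrame):

from sklearn.preprocessing import MinMaxScaler

# Rescale every column to [0, 1], matching the (x - min) / (max - min) formula above.
Xy_scaled = MinMaxScaler().fit_transform(Xy)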


In [7]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV']]
y = Xy.ix[:,['Sales']]
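Aside: .ix is deprecated in newer pandas releases. The label-based equivalent (a sketch) is:

# .loc selects by label, just as .ix does here.
X = Xy.loc[:, ['TV']]
y = Xy.loc[:, ['Sales']]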

In [24]:
# Last time we did this by hand; now we are smarter and use the sklearn
# routine, which splits the data into training and testing subsets.
cross_validation.train_test_split([1,2,3,4,5],
                                  [6,7,8,9,10],
                                  test_size=0.4,
                                  random_state=5)


Out[24]:
[[2, 3, 4], [5, 1], [7, 8, 9], [10, 6]]

In [25]:
# Now we do it for the real data.
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8)
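With test_size=0.8, only 20% of the 200 rows are kept for training. A quick sanity check on the shapes (a sketch; the counts assume the 200-row dataset above):

# Expect (40, 1) for training and (160, 1) for testing.
print X_train.shape, X_test.shape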

In [26]:
# Let's take a quick look at the data.
X_train


Out[26]:
TV
52 0.729456
119 0.063240
84 0.719648
173 0.567129
103 0.633074
13 0.327359
58 0.710517
18 0.231654
142 0.743321
82 0.252283
192 0.055800
16 0.226919
105 0.463984
133 0.740954
124 0.773757
1 0.148123
177 0.573216
126 0.024011
49 0.223876
187 0.643896
178 0.933378
59 0.710179
102 0.945215
155 0.011498
41 0.596212
91 0.094352
92 0.733852
20 0.736219
167 0.696990
186 0.469395
63 0.344944
17 0.949273
144 0.322962
150 0.946906
147 0.820088
33 0.895840
145 0.472100
199 0.782550
151 0.406831
164 0.393980

In [27]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[27]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [28]:
# Here are the intercept and slope of the line we computed.
# Beta_0
print reg.intercept_
# Beta_1
print reg.coef_


[ 0.19788723]
[[ 0.57698067]]
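A prediction from this model is just beta_0 + beta_1 * x, so we can check the fit by hand (a minimal sketch using the attributes printed above):

# Predicting by hand should agree with reg.predict.
x0 = 0.5
print reg.intercept_[0] + reg.coef_[0][0]*x0
print reg.predict([[x0]])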

In [29]:
# Do a plot
plotX = np.linspace(0,1,100)
plotY = reg.predict(np.matrix(plotX).T)
py.plot(X_train,y_train,'ro')
py.plot(X_test,y_test,'go')
py.plot(plotX,plotY,'b-')


Out[29]:
[<matplotlib.lines.Line2D at 0x173132b0>]

In [30]:
# Use the metrics package to print our errors.  See discussion on slides.
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.0181351494343
testing error
0.0159325184683
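The metrics package has other regression scores too; for instance, r2_score reports the fraction of variance explained (a sketch):

# R^2 on the held-out data: 1.0 is a perfect fit, 0.0 is no better than predicting the mean.
print metrics.r2_score(y_test, reg.predict(X_test))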

Back to slides.

Multi-dimensional regression


In [31]:
# Select out our predictor columns and our response columns
X = Xy.ix[:,['TV','Radio']]
y = Xy.ix[:,['Sales']]

# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
                                                                  y,
                                                                  test_size=0.8,
                                                                  random_state=123)

In [32]:
# Plot the data to get a feel for it.
mlab.clf()
mlab.points3d(X_train.ix[:,0]/X.ix[:,0].std(), 
              X_train.ix[:,1]/X.ix[:,1].std(), 
              y_train.ix[:,0]/y.ix[:,0].std(),
              color=(1,0,0), scale_factor=0.2)
mlab.points3d(X_test.ix[:,0]/X.ix[:,0].std(), 
              X_test.ix[:,1]/X.ix[:,1].std(), 
              y_test.ix[:,0]/y.ix[:,0].std(),
              color=(0,1,0), scale_factor=0.2)
mlab.axes()
mlab.show()

In [33]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)


Out[33]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
# Create data for plotting: evaluate the fitted plane on a grid.
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
                          np.linspace(0,1,size))
# Flatten the grid into a (size*size, 2) array of (TV, Radio) points,
# predict, then reshape the predictions back into a grid for the surface plot.
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
                                           yPlot.flatten()])))
zPlot = zPlot.reshape([size,size])

In [35]:
# Since we will be plotting many times, we wrap the plotting code in a function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
    mlab.clf()
    mlab.points3d(X_train.ix[:,0], 
                  X_train.ix[:,1], 
                  y_train.ix[:,0],
                  color=(1,0,0), scale_factor=scale_factor)
    mlab.points3d(X_test.ix[:,0], 
                  X_test.ix[:,1], 
                  y_test.ix[:,0],
                  color=(0,1,0), scale_factor=scale_factor)
    mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
    mlab.axes()
    mlab.show()
    
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)

In [36]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))


training error
0.00301813848558
testing error
0.00529973748665
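The feature_selection module imported at the top has gone unused; as a sketch, its f_regression function scores each predictor's univariate relationship with the response, which can suggest which columns are worth including:

# Univariate F-test of each predictor against Sales.
F, pvals = feature_selection.f_regression(Xy[['TV','Radio','Newspaper']], Xy['Sales'])
print F
print pvals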

Back to the notes.

